{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Notebook [1]: First steps with cdQA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook shows how to use the `cdQA` pipeline to perform question answering on a custom dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***Note:*** *If you are using colab, you will need to install `cdQA` by executing `!pip install cdqa` in a cell.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:32:09.138284Z",
     "start_time": "2019-07-20T13:32:01.868622Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/andre.farias/python3.7.0/lib/python3.7/site-packages/tqdm/autonotebook/__init__.py:18: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
      "  \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from ast import literal_eval\n",
    "\n",
    "from cdqa.utils.filters import filter_paragraphs\n",
    "from cdqa.pipeline import QAPipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download pre-trained reader model and example dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:33:36.002880Z",
     "start_time": "2019-07-20T13:32:10.618797Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Downloading BNP data...\n",
      "\n",
      "Downloading trained model...\n"
     ]
    }
   ],
   "source": [
    "from cdqa.utils.download import download_model, download_bnpp_data\n",
    "\n",
    "download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')\n",
    "download_model(model='bert-squad_1.1', dir='./models')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:35:00.377971Z",
     "start_time": "2019-07-20T13:34:59.764491Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>title</th>\n",
       "      <th>category</th>\n",
       "      <th>link</th>\n",
       "      <th>abstract</th>\n",
       "      <th>paragraphs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>13.05.2019</td>\n",
       "      <td>The banking jobs : Assistant Vice President – ...</td>\n",
       "      <td>Careers</td>\n",
       "      <td>https://group.bnpparibas/en/news/banking-jobs-...</td>\n",
       "      <td>Within the Group’s Corporate and Institutional...</td>\n",
       "      <td>[I manage a team in charge of designing and im...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13.05.2019</td>\n",
       "      <td>BNP Paribas at #VivaTech : discover the progra...</td>\n",
       "      <td>Innovation</td>\n",
       "      <td>https://group.bnpparibas/en/news/bnp-paribas-v...</td>\n",
       "      <td>From Thursday 16 to Saturday 18 May 2019, join...</td>\n",
       "      <td>[With François Hollande, Chairman of French fo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>13.05.2019</td>\n",
       "      <td>\"The bank with an IT budget of more than EUR6 ...</td>\n",
       "      <td>Group</td>\n",
       "      <td>https://group.bnpparibas/en/news/the-bank-budg...</td>\n",
       "      <td>Interview with Jean-Laurent Bonnafé, Director ...</td>\n",
       "      <td>[We did the groundwork between 2012 and 2016, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10.05.2019</td>\n",
       "      <td>BNP Paribas at #VivaTech : discover the progra...</td>\n",
       "      <td>Innovation</td>\n",
       "      <td>https://group.bnpparibas/en/news/bnp-paribas-v...</td>\n",
       "      <td>From Thursday 16 to Saturday 18 May 2019, join...</td>\n",
       "      <td>[As part of the ‘United Tech of Europe’ theme,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10.05.2019</td>\n",
       "      <td>When Artificial Intelligence participates in r...</td>\n",
       "      <td>Careers</td>\n",
       "      <td>https://group.bnpparibas/en/news/artificial-in...</td>\n",
       "      <td>As the competition to attract talent intensifi...</td>\n",
       "      <td>[Online recruitment is already the norm. Accor...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date                                              title    category  \\\n",
       "0  13.05.2019  The banking jobs : Assistant Vice President – ...     Careers   \n",
       "1  13.05.2019  BNP Paribas at #VivaTech : discover the progra...  Innovation   \n",
       "2  13.05.2019  \"The bank with an IT budget of more than EUR6 ...       Group   \n",
       "3  10.05.2019  BNP Paribas at #VivaTech : discover the progra...  Innovation   \n",
       "4  10.05.2019  When Artificial Intelligence participates in r...     Careers   \n",
       "\n",
       "                                                link  \\\n",
       "0  https://group.bnpparibas/en/news/banking-jobs-...   \n",
       "1  https://group.bnpparibas/en/news/bnp-paribas-v...   \n",
       "2  https://group.bnpparibas/en/news/the-bank-budg...   \n",
       "3  https://group.bnpparibas/en/news/bnp-paribas-v...   \n",
       "4  https://group.bnpparibas/en/news/artificial-in...   \n",
       "\n",
       "                                            abstract  \\\n",
       "0  Within the Group’s Corporate and Institutional...   \n",
       "1  From Thursday 16 to Saturday 18 May 2019, join...   \n",
       "2  Interview with Jean-Laurent Bonnafé, Director ...   \n",
       "3  From Thursday 16 to Saturday 18 May 2019, join...   \n",
       "4  As the competition to attract talent intensifi...   \n",
       "\n",
       "                                          paragraphs  \n",
       "0  [I manage a team in charge of designing and im...  \n",
       "1  [With François Hollande, Chairman of French fo...  \n",
       "2  [We did the groundwork between 2012 and 2016, ...  \n",
       "3  [As part of the ‘United Tech of Europe’ theme,...  \n",
       "4  [Online recruitment is already the norm. Accor...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})\n",
    "df = filter_paragraphs(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instantiate the cdQA pipeline from a pre-trained reader model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:35:07.072516Z",
     "start_time": "2019-07-20T13:35:02.764545Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "QAPipeline(reader=BertQA(bert_model='bert-base-uncased', do_lower_case=True,\n",
       "                         fp16=False, gradient_accumulation_steps=1,\n",
       "                         learning_rate=3e-05, local_rank=-1, loss_scale=0,\n",
       "                         max_answer_length=30, n_best_size=20, no_cuda=False,\n",
       "                         null_score_diff_threshold=0.0, num_train_epochs=2,\n",
       "                         output_dir=None, predict_batch_size=8, seed=42,\n",
       "                         server_ip='', server_port='', train_batch_size=12,\n",
       "                         verbose_logging=False, version_2_with_negative=False,\n",
       "                         warmup_proportion=0.1))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')\n",
    "cdqa_pipeline.fit_retriever(df=df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Execute a query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-20T13:35:37.973867Z",
     "start_time": "2019-07-20T13:35:22.104799Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "3it [00:00, 931.93it/s]\n",
      "The pre-trained model you are loading is an uncased model but you have set `do_lower_case` to False. We are setting `do_lower_case=True` for you but you may want to check this behavior.\n"
     ]
    }
   ],
   "source": [
    "query = 'Since when does the Excellence Program of BNP Paribas exist?'\n",
    "prediction = cdqa_pipeline.predict(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Explore predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-06-25T14:21:26.561003Z",
     "start_time": "2019-06-25T14:21:26.549540Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "query: Since when does the Excellence Program of BNP Paribas exist?\n",
      "answer: January 2016\n",
      "title: BNP Paribas’ commitment to universities and schools\n",
      "paragraph: Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.\n"
     ]
    }
   ],
   "source": [
    "print('query: {}'.format(query))\n",
    "print('answer: {}'.format(prediction[0]))\n",
    "print('title: {}'.format(prediction[1]))\n",
    "print('paragraph: {}'.format(prediction[2]))"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
